# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
import warnings
warnings.filterwarnings('ignore')
# Load the data
df = pd.read_csv('HR_Employee_Attrition.csv')
df.head()
| EmployeeNumber | Gender | Age | Attrition | BusinessTravel | Department | DistanceFromHome | EducationField | EnvironmentSatisfaction | HourlyRate | ... | OverTime | PercentSalaryHike | PerformanceRating | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 41 | Yes | Travel_Rarely | Sales | 1 | Life Sciences | 2 | 94.0 | ... | Yes | 11 | 3.0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 2 | Male | 49 | No | Travel_Frequently | Research & Development | 8 | Life Sciences | 3 | 61.0 | ... | No | 23 | 4.0 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 4 | Male | 37 | Yes | Travel_Rarely | Research & Development | 2 | Other | 4 | 92.0 | ... | Yes | 15 | 3.0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 5 | Female | 33 | No | Travel_Frequently | Research & Development | 3 | Life Sciences | 4 | 56.0 | ... | Yes | 11 | 3.0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 7 | Male | 27 | No | Travel_Rarely | Research & Development | 2 | Medical | 1 | 40.0 | ... | No | 12 | 3.0 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 25 columns
df.shape
(1470, 25)
# get the info of the data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 EmployeeNumber 1470 non-null int64 1 Gender 1470 non-null object 2 Age 1470 non-null int64 3 Attrition 1470 non-null object 4 BusinessTravel 1470 non-null object 5 Department 1470 non-null object 6 DistanceFromHome 1470 non-null int64 7 EducationField 1470 non-null object 8 EnvironmentSatisfaction 1470 non-null int64 9 HourlyRate 1469 non-null float64 10 JobRole 1470 non-null object 11 JobSatisfaction 1470 non-null int64 12 MaritalStatus 1470 non-null object 13 MonthlyIncome 1470 non-null int64 14 NumCompaniesWorked 1470 non-null int64 15 OverTime 1470 non-null object 16 PercentSalaryHike 1470 non-null int64 17 PerformanceRating 1469 non-null float64 18 TotalWorkingYears 1470 non-null int64 19 TrainingTimesLastYear 1470 non-null int64 20 WorkLifeBalance 1470 non-null int64 21 YearsAtCompany 1470 non-null int64 22 YearsInCurrentRole 1470 non-null int64 23 YearsSinceLastPromotion 1470 non-null int64 24 YearsWithCurrManager 1470 non-null int64 dtypes: float64(2), int64(15), object(8) memory usage: 287.2+ KB
df.isna().sum()
EmployeeNumber 0 Gender 0 Age 0 Attrition 0 BusinessTravel 0 Department 0 DistanceFromHome 0 EducationField 0 EnvironmentSatisfaction 0 HourlyRate 1 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 NumCompaniesWorked 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 1 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
df.isnull().values.any()
True
df['HourlyRate'].describe()
count 1469.000000 mean 65.886998 std 20.335725 min 30.000000 25% 48.000000 50% 66.000000 75% 84.000000 max 100.000000 Name: HourlyRate, dtype: float64
# HourlyRate: the column mean is the fill value for its missing entries.
mean_hourly_rate = df['HourlyRate'].mean()
mean_hourly_rate
# PerformanceRating: the column median is the fill value for its missing entries.
median_performancerating = df['PerformanceRating'].median()
# Apply both imputations at once; each key names the column its value fills.
df = df.fillna({
    'HourlyRate': mean_hourly_rate,
    'PerformanceRating': median_performancerating,
})
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| EmployeeNumber | 1470.0 | 1024.865306 | 602.024335 | 1.0 | 491.25 | 1020.5 | 1555.75 | 2068.0 |
| Age | 1470.0 | 36.923810 | 9.135373 | 18.0 | 30.00 | 36.0 | 43.00 | 60.0 |
| DistanceFromHome | 1470.0 | 9.192517 | 8.106864 | 1.0 | 2.00 | 7.0 | 14.00 | 29.0 |
| EnvironmentSatisfaction | 1470.0 | 2.721769 | 1.093082 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 |
| HourlyRate | 1470.0 | 65.886998 | 20.328802 | 30.0 | 48.00 | 66.0 | 83.75 | 100.0 |
| JobSatisfaction | 1470.0 | 2.728571 | 1.102846 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 |
| MonthlyIncome | 1470.0 | 6502.931293 | 4707.956783 | 1009.0 | 2911.00 | 4919.0 | 8379.00 | 19999.0 |
| NumCompaniesWorked | 1470.0 | 2.693197 | 2.498009 | 0.0 | 1.00 | 2.0 | 4.00 | 9.0 |
| PercentSalaryHike | 1470.0 | 15.209524 | 3.659938 | 11.0 | 12.00 | 14.0 | 18.00 | 25.0 |
| PerformanceRating | 1470.0 | 3.153741 | 0.360824 | 3.0 | 3.00 | 3.0 | 3.00 | 4.0 |
| TotalWorkingYears | 1470.0 | 11.279592 | 7.780782 | 0.0 | 6.00 | 10.0 | 15.00 | 40.0 |
| TrainingTimesLastYear | 1470.0 | 2.799320 | 1.289271 | 0.0 | 2.00 | 3.0 | 3.00 | 6.0 |
| WorkLifeBalance | 1470.0 | 2.761224 | 0.706476 | 1.0 | 2.00 | 3.0 | 3.00 | 4.0 |
| YearsAtCompany | 1470.0 | 7.008163 | 6.126525 | 0.0 | 3.00 | 5.0 | 9.00 | 40.0 |
| YearsInCurrentRole | 1470.0 | 4.229252 | 3.623137 | 0.0 | 2.00 | 3.0 | 7.00 | 18.0 |
| YearsSinceLastPromotion | 1470.0 | 2.187755 | 3.222430 | 0.0 | 0.00 | 1.0 | 3.00 | 15.0 |
| YearsWithCurrManager | 1470.0 | 4.123129 | 3.568136 | 0.0 | 2.00 | 3.0 | 7.00 | 17.0 |
# For every object-typed column, show its distinct values, then how often
# each value occurs, followed by a separator line.
for column in df.columns:
    if df[column].dtype != object:
        continue
    print(f"{column} : {df[column].unique()}")
    print(df[column].value_counts())
    print("_________________________________________________________________")
Gender : ['Female' 'Male'] Male 882 Female 588 Name: Gender, dtype: int64 _________________________________________________________________ Attrition : ['Yes' 'No'] No 1233 Yes 237 Name: Attrition, dtype: int64 _________________________________________________________________ BusinessTravel : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel'] Travel_Rarely 1043 Travel_Frequently 277 Non-Travel 150 Name: BusinessTravel, dtype: int64 _________________________________________________________________ Department : ['Sales' 'Research & Development' 'Human Resources'] Research & Development 961 Sales 446 Human Resources 63 Name: Department, dtype: int64 _________________________________________________________________ EducationField : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree' 'Human Resources'] Life Sciences 606 Medical 464 Marketing 159 Technical Degree 132 Other 82 Human Resources 27 Name: EducationField, dtype: int64 _________________________________________________________________ JobRole : ['Sales Executive' 'Research Scientist' 'Laboratory Technician' 'Manufacturing Director' 'Healthcare Representative' 'Manager' 'Sales Representative' 'Research Director' 'Human Resources'] Sales Executive 326 Research Scientist 292 Laboratory Technician 259 Manufacturing Director 145 Healthcare Representative 131 Manager 102 Sales Representative 83 Research Director 80 Human Resources 52 Name: JobRole, dtype: int64 _________________________________________________________________ MaritalStatus : ['Single' 'Married' 'Divorced'] Married 673 Single 470 Divorced 327 Name: MaritalStatus, dtype: int64 _________________________________________________________________ OverTime : ['Yes' 'No'] No 1054 Yes 416 Name: OverTime, dtype: int64 _________________________________________________________________
df['Gender'].unique()
array(['Female', 'Male'], dtype=object)
# Pairwise Pearson correlation of the numeric columns only. The frame still
# holds object (string) columns here, and DataFrame.corr() raises on those in
# pandas >= 2.0, so they are filtered out explicitly.
df.select_dtypes(include='number').corr()
| EmployeeNumber | Age | DistanceFromHome | EnvironmentSatisfaction | HourlyRate | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EmployeeNumber | 1.000000 | -0.010145 | 0.032916 | 0.017621 | 0.035508 | -0.046247 | -0.014829 | -0.001251 | -0.012944 | -0.020359 | -0.014365 | 0.023603 | 0.010309 | -0.011240 | -0.008416 | -0.009019 | -0.009197 |
| Age | -0.010145 | 1.000000 | -0.001686 | 0.010146 | 0.024353 | -0.004892 | 0.497855 | 0.299635 | 0.003634 | 0.001904 | 0.680381 | -0.019621 | -0.021490 | 0.311309 | 0.212901 | 0.216513 | 0.202089 |
| DistanceFromHome | 0.032916 | -0.001686 | 1.000000 | -0.016075 | 0.030783 | -0.003669 | -0.017014 | -0.029251 | 0.040235 | 0.027110 | 0.004628 | -0.036942 | -0.026556 | 0.009508 | 0.018845 | 0.010029 | 0.014406 |
| EnvironmentSatisfaction | 0.017621 | 0.010146 | -0.016075 | 1.000000 | -0.049723 | -0.006784 | -0.006259 | 0.012594 | -0.031701 | -0.029548 | -0.002693 | -0.019359 | 0.027627 | 0.001458 | 0.018007 | 0.016194 | -0.004999 |
| HourlyRate | 0.035508 | 0.024353 | 0.030783 | -0.049723 | 1.000000 | -0.071387 | -0.015711 | 0.022378 | -0.009330 | -0.002652 | -0.002300 | -0.008421 | -0.004677 | -0.019649 | -0.024151 | -0.027086 | -0.020289 |
| JobSatisfaction | -0.046247 | -0.004892 | -0.003669 | -0.006784 | -0.071387 | 1.000000 | -0.007157 | -0.055699 | 0.020002 | 0.002297 | -0.020185 | -0.005779 | -0.019459 | -0.003803 | -0.002305 | -0.018214 | -0.027656 |
| MonthlyIncome | -0.014829 | 0.497855 | -0.017014 | -0.006259 | -0.015711 | -0.007157 | 1.000000 | 0.149515 | -0.027269 | -0.017120 | 0.772893 | -0.021736 | 0.030683 | 0.514285 | 0.363818 | 0.344978 | 0.344079 |
| NumCompaniesWorked | -0.001251 | 0.299635 | -0.029251 | 0.012594 | 0.022378 | -0.055699 | 0.149515 | 1.000000 | -0.010238 | -0.014095 | 0.237639 | -0.066054 | -0.008366 | -0.118421 | -0.090754 | -0.036814 | -0.110319 |
| PercentSalaryHike | -0.012944 | 0.003634 | 0.040235 | -0.031701 | -0.009330 | 0.020002 | -0.027269 | -0.010238 | 1.000000 | 0.773550 | -0.020608 | -0.005221 | -0.003280 | -0.035991 | -0.001520 | -0.022154 | -0.011985 |
| PerformanceRating | -0.020359 | 0.001904 | 0.027110 | -0.029548 | -0.002652 | 0.002297 | -0.017120 | -0.014095 | 0.773550 | 1.000000 | 0.006744 | -0.015579 | 0.002572 | 0.003435 | 0.034986 | 0.017896 | 0.022827 |
| TotalWorkingYears | -0.014365 | 0.680381 | 0.004628 | -0.002693 | -0.002300 | -0.020185 | 0.772893 | 0.237639 | -0.020608 | 0.006744 | 1.000000 | -0.035662 | 0.001008 | 0.628133 | 0.460365 | 0.404858 | 0.459188 |
| TrainingTimesLastYear | 0.023603 | -0.019621 | -0.036942 | -0.019359 | -0.008421 | -0.005779 | -0.021736 | -0.066054 | -0.005221 | -0.015579 | -0.035662 | 1.000000 | 0.028072 | 0.003569 | -0.005738 | -0.002067 | -0.004096 |
| WorkLifeBalance | 0.010309 | -0.021490 | -0.026556 | 0.027627 | -0.004677 | -0.019459 | 0.030683 | -0.008366 | -0.003280 | 0.002572 | 0.001008 | 0.028072 | 1.000000 | 0.012089 | 0.049856 | 0.008941 | 0.002759 |
| YearsAtCompany | -0.011240 | 0.311309 | 0.009508 | 0.001458 | -0.019649 | -0.003803 | 0.514285 | -0.118421 | -0.035991 | 0.003435 | 0.628133 | 0.003569 | 0.012089 | 1.000000 | 0.758754 | 0.618409 | 0.769212 |
| YearsInCurrentRole | -0.008416 | 0.212901 | 0.018845 | 0.018007 | -0.024151 | -0.002305 | 0.363818 | -0.090754 | -0.001520 | 0.034986 | 0.460365 | -0.005738 | 0.049856 | 0.758754 | 1.000000 | 0.548056 | 0.714365 |
| YearsSinceLastPromotion | -0.009019 | 0.216513 | 0.010029 | 0.016194 | -0.027086 | -0.018214 | 0.344978 | -0.036814 | -0.022154 | 0.017896 | 0.404858 | -0.002067 | 0.008941 | 0.618409 | 0.548056 | 1.000000 | 0.510224 |
| YearsWithCurrManager | -0.009197 | 0.202089 | 0.014406 | -0.004999 | -0.020289 | -0.027656 | 0.344079 | -0.110319 | -0.011985 | 0.022827 | 0.459188 | -0.004096 | 0.002759 | 0.769212 | 0.714365 | 0.510224 | 1.000000 |
# Visualize the correlations of the numeric columns as an annotated heatmap.
plt.figure(figsize=(13, 13))  # 13in by 13in
# Object columns are excluded explicitly: DataFrame.corr() raises on them in
# pandas >= 2.0. Each cell is annotated as a percentage with one decimal.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, fmt='.1%')
<AxesSubplot:>
'Age' is fairly strongly positively correlated with 'TotalWorkingYears' (0.68): as an employee's age increases, their total working years tend to increase as well.
'MonthlyIncome' is highly positively correlated with 'TotalWorkingYears' (correlation coefficient 0.77). This says that employees with more total working years tend to have higher monthly income.
'PercentSalaryHike' is highly positively correlated with 'PerformanceRating' (0.77). This says that employees with a higher performance rating tend to receive a larger percentage salary hike.
Likewise, 'YearsAtCompany' is highly positively correlated with 'YearsInCurrentRole' (0.76): employees who have been at the company longer tend to have spent more years in their current role.
LEAST IS:
# Attrition share as a pie chart.
# The wedge labels come from the value_counts index, so each label always
# matches its wedge even if the class order in the data changes.
attrition_counts = df['Attrition'].value_counts()
plt.pie(attrition_counts, labels=attrition_counts.index, autopct='%1.1f%%')
plt.show()
# Head count per age, split into employees who left and who stayed.
fig_dims = (12, 4)
fig, ax = plt.subplots(figsize=fig_dims)
sns.countplot(
    data=df,
    x='Age',
    hue='Attrition',
    palette="colorblind",
    edgecolor=sns.color_palette("dark", n_colors=1),
    ax=ax,  # draw on the wide axes created above
);
df.isnull().sum().sum()
0
# NOTE(review): plot_categorical_variables is not defined or imported anywhere
# in the visible part of this file — confirm it exists before this call,
# otherwise this line raises NameError.
plot_categorical_variables(df)
# Within each gender, the fraction of employees per attrition value
# (normalize=True, so every row sums to 1).
gender_attrition = (
    df.groupby('Gender')['Attrition']
    .value_counts(normalize=True)
    .unstack()
)
gender_attrition
| Attrition | No | Yes |
|---|---|---|
| Gender | ||
| Female | 0.852041 | 0.147959 |
| Male | 0.829932 | 0.170068 |
# 1. Attrition rate per gender, as side-by-side bars
gender_axes = gender_attrition.plot.bar(stacked=False)
gender_axes.set_xlabel('Gender')
gender_axes.set_ylabel('Attrition Rate')
gender_axes.set_title('Attrition by Gender')
plt.show()
# Compare attrition by business travel (absolute head counts per category)
Travel_attrition = df.groupby('BusinessTravel')['Attrition'].value_counts(normalize=False).unstack()
Travel_attrition
Travel_attrition.plot(kind='bar', stacked=False)
plt.xlabel('BusinessTravel')
# normalize=False yields raw counts, so the axis is labelled as a count, not a rate.
plt.ylabel('Number of Employees')
plt.title('Attrition by BusinessTravel')
plt.show()
# Marital status: share of employees in each category
marital_status_counts = df['MaritalStatus'].value_counts()
marital_status_counts.plot.pie(autopct='%1.1f%%')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title('Marital Status Distribution')
plt.show()
def _plot_attrition_counts(column, stacked=False):
    """Bar-plot the number of employees per attrition value for each value of *column*.

    Parameters:
        column: name of the ``df`` column to group by; also used as the
            x-axis label and in the plot title.
        stacked: passed through to ``DataFrame.plot``; stack the attrition
            bars instead of drawing them side by side.

    Returns:
        The count table that was plotted (rows: values of *column*,
        columns: attrition values).
    """
    counts = df.groupby(column)['Attrition'].value_counts(normalize=False).unstack()
    counts.plot(kind='bar', stacked=stacked)
    plt.xlabel(column)
    # The table holds raw counts (normalize=False), so the axis is a count, not a rate.
    plt.ylabel('Number of Employees')
    plt.title(f'Attrition by {column}')
    plt.show()
    return counts


# Compare attrition by MaritalStatus
MaritalStatus_attrition = _plot_attrition_counts('MaritalStatus')
# Compare attrition by TotalWorkingYears (stacked bars)
Workyears_attrition = _plot_attrition_counts('TotalWorkingYears', stacked=True)
# Compare attrition by EducationField
Education_attrition = _plot_attrition_counts('EducationField')
# Compare attrition by YearsSinceLastPromotion
promortion_attrition = _plot_attrition_counts('YearsSinceLastPromotion')
# Compare attrition by PerformanceRating
Performace_attrition = _plot_attrition_counts('PerformanceRating')
sns.catplot(y='JobRole', kind='count', hue='Gender', aspect=2, data=df)
<seaborn.axisgrid.FacetGrid at 0x1ffc4999730>
df.columns
Index(['EmployeeNumber', 'Gender', 'Age', 'Attrition', 'BusinessTravel',
'Department', 'DistanceFromHome', 'EducationField',
'EnvironmentSatisfaction', 'HourlyRate', 'JobRole', 'JobSatisfaction',
'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'OverTime',
'PercentSalaryHike', 'PerformanceRating', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
# Mean YearsWithCurrManager per attrition value, with a more descriptive column name.
pivot_table_years_with_manager = (
    df.pivot_table(index='Attrition', values='YearsWithCurrManager', aggfunc='mean')
    .rename(columns={'YearsWithCurrManager': 'AverageYearsWithManager'})
)
print(pivot_table_years_with_manager)
AverageYearsWithManager Attrition No 4.367397 Yes 2.852321
# Mean years with the current manager for each attrition value
average_years_with_manager = df.groupby('Attrition')['YearsWithCurrManager'].mean()
# One bar per attrition value
plt.figure(figsize=(8, 6))
sns.barplot(
    x=average_years_with_manager.index,
    y=average_years_with_manager.values,
    palette='colorblind',
)
plt.title('Average YearsWithCurrManager by Attrition')
plt.xlabel('Attrition')
plt.ylabel('Average YearsWithCurrManager')
plt.show()
# Number of employees per income band (MonthlyIncome rounded to the nearest
# 1000) and attrition value.
# The previous version re-grouped the already aggregated rows and counted them,
# which yields the number of distinct incomes in each band rather than the
# number of employees. Grouping the raw rows by band and taking the group size
# counts people directly.
income_band = df['MonthlyIncome'].round(-3)
rate_attrition = (
    df.groupby([income_band, 'Attrition'])
    .size()
    .reset_index(name='Counts')
)
plt.figure(figsize=(10,5))
plt.title('Monthly Income basis counts of People in an Organization')
sns.lineplot(x='MonthlyIncome',y='Counts',hue='Attrition', data=rate_attrition)
plt.show()
pd.crosstab(df['Department'], df['Attrition'])
| Attrition | No | Yes |
|---|---|---|
| Department | ||
| Human Resources | 51 | 12 |
| Research & Development | 828 | 133 |
| Sales | 354 | 92 |
# Attrition Rate = (Number of Employees who Left the Department) / (Total Number of Employees in the Department) * 100
#For Human Resources: Attrition Rate = (12 / (12 + 51)) * 100 = 19.05%
#For Research & Development: Attrition Rate = (133 / (133 + 828)) * 100 = 13.85%
#For Sales: Attrition Rate = (92 / (92 + 354)) * 100 = 20.63%
# Head count per department, split by attrition value
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='Department', hue='Attrition')
plt.title('Department wise Counts of People in an Organization')
plt.show()
pd.crosstab(df['EnvironmentSatisfaction'], df['Attrition'])
| Attrition | No | Yes |
|---|---|---|
| EnvironmentSatisfaction | ||
| 1 | 212 | 72 |
| 2 | 244 | 43 |
| 3 | 391 | 62 |
| 4 | 386 | 60 |
# Head count per job-satisfaction level, split by attrition value
plt.figure(figsize=(10,5))
plt.title('Job Satisfaction level Counts of People in an Organization')
sns.countplot(x=df['JobSatisfaction'],hue=df['Attrition'])
# plt.show must be called; the bare name only references the function.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
pd.crosstab(df['OverTime'], df['Attrition'])
| Attrition | No | Yes |
|---|---|---|
| OverTime | ||
| No | 944 | 110 |
| Yes | 289 | 127 |
# Head count per overtime flag, split by attrition value
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='OverTime', hue='Attrition')
plt.title('People in an Organization by Overtime and Attrition')
plt.xlabel('OverTime')
plt.ylabel('Count')
plt.show()
# Compare attrition by NumCompaniesWorked (absolute head counts, stacked)
# Stored under its own name so the TotalWorkingYears table computed earlier
# (Workyears_attrition) is not overwritten with unrelated data.
companies_attrition = df.groupby('NumCompaniesWorked')['Attrition'].value_counts(normalize=False).unstack()
companies_attrition.plot(kind='bar', stacked=True)
plt.xlabel('NumCompaniesWorked')
# normalize=False yields raw counts, so the axis is labelled as a count, not a rate.
plt.ylabel('Number of Employees')
plt.title('Attrition by NumCompaniesWorked')
plt.show()
# Compare attrition by YearsInCurrentRole (absolute head counts)
year_attrition = df.groupby('YearsInCurrentRole')['Attrition'].value_counts(normalize=False).unstack()
year_attrition.plot(kind='bar', stacked=False)
plt.xlabel('YearsInCurrentRole')
# normalize=False yields raw counts, so the axis is labelled as a count, not a rate.
plt.ylabel('Number of Employees')
plt.title('Attrition by YearsInCurrentRole')
plt.show()
# Non-null MonthlyIncome entries per (salary-hike percentage, attrition value),
# i.e. how many employees fall into each combination.
hike_att = (
    df.groupby(['PercentSalaryHike', 'Attrition'])['MonthlyIncome']
    .count()
    .reset_index(name='Counts')
)
plt.figure(figsize=(10, 5))
sns.lineplot(data=hike_att, x='PercentSalaryHike', y='Counts', hue='Attrition')
plt.title('Count of Hike Percentages people receive in an Organization')
plt.show()
# Head count per marital status, with the 'Yes' (left) bars drawn first
sns.countplot(
    data=df,
    x='MaritalStatus',
    hue='Attrition',
    hue_order=['Yes', 'No'],
    palette='Set1',
)
plt.show()
# CrossTab
pd.crosstab(df.Attrition, df.Gender, margins=True, normalize=True)
# Male Pop/Female Pop = 0.60/0.40 = 1.5
# Male Attrition/Female Attrition = 0.102/0.059 ≈ 1.72
| Gender | Female | Male | All |
|---|---|---|---|
| Attrition | |||
| No | 0.340816 | 0.497959 | 0.838776 |
| Yes | 0.059184 | 0.102041 | 0.161224 |
| All | 0.400000 | 0.600000 | 1.000000 |
Based on this cross-tabulation, we can infer that:
The majority of employees in the dataset have 'Attrition' as 'No' (approximately 83.88%). The percentage of employees with 'Attrition' as 'Yes' is relatively low (approximately 16.12%). The proportion of Male employees is higher than that of Female employees, both for 'Attrition' as 'Yes' and 'No'.
# Perform cross-tabulation
cross_tab = pd.crosstab(df.Attrition, df.EducationField, margins=True, normalize=True)
cross_tab
| EducationField | Human Resources | Life Sciences | Marketing | Medical | Other | Technical Degree | All |
|---|---|---|---|---|---|---|---|
| Attrition | |||||||
| No | 0.013605 | 0.351701 | 0.084354 | 0.272789 | 0.048299 | 0.068027 | 0.838776 |
| Yes | 0.004762 | 0.060544 | 0.023810 | 0.042857 | 0.007483 | 0.021769 | 0.161224 |
| All | 0.018367 | 0.412245 | 0.108163 | 0.315646 | 0.055782 | 0.089796 | 1.000000 |
# Columns stored with object dtype hold the categorical variables; keep both
# the sub-frame and the plain list of their names.
df_object = df.select_dtypes(include=['object'])
lstcatcolumns = df_object.columns.tolist()
lstcatcolumns
['Gender', 'Attrition', 'BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus', 'OverTime']
df.select_dtypes(include=['object'])
| Gender | Attrition | BusinessTravel | Department | EducationField | JobRole | MaritalStatus | OverTime | |
|---|---|---|---|---|---|---|---|---|
| 0 | Female | Yes | Travel_Rarely | Sales | Life Sciences | Sales Executive | Single | Yes |
| 1 | Male | No | Travel_Frequently | Research & Development | Life Sciences | Research Scientist | Married | No |
| 2 | Male | Yes | Travel_Rarely | Research & Development | Other | Laboratory Technician | Single | Yes |
| 3 | Female | No | Travel_Frequently | Research & Development | Life Sciences | Research Scientist | Married | Yes |
| 4 | Male | No | Travel_Rarely | Research & Development | Medical | Laboratory Technician | Married | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | Male | No | Travel_Frequently | Research & Development | Medical | Laboratory Technician | Married | No |
| 1466 | Male | No | Travel_Rarely | Research & Development | Medical | Healthcare Representative | Married | No |
| 1467 | Male | No | Travel_Rarely | Research & Development | Life Sciences | Manufacturing Director | Married | Yes |
| 1468 | Male | No | Travel_Frequently | Sales | Medical | Sales Executive | Married | No |
| 1469 | Male | No | Travel_Rarely | Research & Development | Medical | Laboratory Technician | Married | No |
1470 rows × 8 columns
# NumCompaniesWorked against TotalWorkingYears, coloured by attrition value.
fig_dims = (12,6)
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is the
# figure, and draw on the axes that were just created.
fig, scatter_ax = plt.subplots(figsize=fig_dims)
sns.scatterplot(x='NumCompaniesWorked', y='TotalWorkingYears', hue='Attrition', data=df, ax=scatter_ax)
<AxesSubplot:xlabel='NumCompaniesWorked', ylabel='TotalWorkingYears'>
# Labels of the numeric columns, obtained through the public select_dtypes API
# rather than the private DataFrame._get_numeric_data helper.
numerical = df.select_dtypes(include='number').columns
# Every remaining column label is treated as categorical.
categorical = set(df.columns) - set(numerical)
def plotHist():
    """Plot the distribution of the leading numeric columns on a 5x3 grid.

    Reads the module-level ``df`` and ``numerical``; draws one panel per
    column (at most 15, row by row), saves the figure to
    'distribution_Before_removing_outliers_from_numerical_columns.png' and
    shows it.
    """
    fig, ax = plt.subplots(5, 3, figsize=(15, 15))
    # zip stops at the shorter of (panels, columns), so having fewer than 15
    # numeric columns leaves panels empty instead of raising IndexError.
    for panel, column in zip(ax.flat, numerical):
        # NOTE(review): seaborn marks distplot as deprecated; histplot/displot
        # are the suggested replacements — confirm before upgrading seaborn.
        sns.distplot(df[column], ax=panel)
    plt.savefig('distribution_Before_removing_outliers_from_numerical_columns.png')
    plt.show()
plotHist()
# checking outliers in numerical columns: one box plot per column on a 4x4 grid
num_of_rows = 4
num_of_cols = 4
fig, ax = plt.subplots(num_of_rows, num_of_cols, figsize=(15,15))
print(numerical)
# Pair panels with columns row by row. zip stops at the shorter sequence, so a
# column count different from 16 cannot raise IndexError.
for panel, column in zip(ax.flat, numerical):
    # Pass the values as x= explicitly; the meaning of a bare positional
    # argument differs between seaborn versions.
    sns.boxplot(x=df[column], ax=panel)
plt.savefig('before_removing_outliers_from_numerical_columns.png')
plt.show()
df_numeric = df.select_dtypes(include=['int', 'float64'])
df_numeric
| EmployeeNumber | Age | DistanceFromHome | EnvironmentSatisfaction | HourlyRate | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 1 | 2 | 94.0 | 4 | 5993 | 8 | 11 | 3.0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 2 | 49 | 8 | 3 | 61.0 | 2 | 5130 | 1 | 23 | 4.0 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 4 | 37 | 2 | 4 | 92.0 | 3 | 2090 | 6 | 15 | 3.0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 5 | 33 | 3 | 4 | 56.0 | 3 | 2909 | 1 | 11 | 3.0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 7 | 27 | 2 | 1 | 40.0 | 2 | 3468 | 9 | 12 | 3.0 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 2061 | 36 | 23 | 3 | 41.0 | 4 | 2571 | 4 | 17 | 3.0 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 2062 | 39 | 6 | 4 | 42.0 | 1 | 9991 | 4 | 15 | 3.0 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 2064 | 27 | 4 | 2 | 87.0 | 2 | 6142 | 1 | 20 | 4.0 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 2065 | 49 | 2 | 4 | 63.0 | 2 | 5390 | 2 | 14 | 3.0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 2068 | 34 | 8 | 2 | 82.0 | 3 | 4404 | 2 | 12 | 3.0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 17 columns
# Check the relationship btw the variables by pair plot
sns.pairplot(df_numeric)
<seaborn.axisgrid.PairGrid at 0x1ffdc887a00>
# Regression of monthly income on tenure, one fitted line per attrition value
sns.lmplot(data=df, x='YearsAtCompany', y='MonthlyIncome', hue='Attrition')
# Human-readable axis labels
plt.ylabel('Monthly Income')
plt.xlabel('Years at Company')
# Render the figure
plt.show()
# MonthlyIncome against Age (left panel) and TotalWorkingYears (right panel)
fig_dims = (10, 5)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=fig_dims)
for panel, predictor in zip(axs, ('Age', 'TotalWorkingYears')):
    sns.scatterplot(data=df, x=predictor, y='MonthlyIncome', ax=panel)
<AxesSubplot:xlabel='TotalWorkingYears', ylabel='MonthlyIncome'>
# MonthlyIncome against PercentSalaryHike (left panel) and TotalWorkingYears
# (right panel), drawn as two separate panels of one figure
fig_dims = (10, 5)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=fig_dims)
for panel, predictor in zip(axs, ('PercentSalaryHike', 'TotalWorkingYears')):
    sns.scatterplot(data=df, x=predictor, y='MonthlyIncome', ax=panel)
<AxesSubplot:xlabel='TotalWorkingYears', ylabel='MonthlyIncome'>
# Correlation matrix of the numeric sub-frame, displayed to two decimals
corr = df_numeric.corr()
corr.round(2)
| EmployeeNumber | Age | DistanceFromHome | EnvironmentSatisfaction | HourlyRate | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EmployeeNumber | 1.00 | -0.01 | 0.03 | 0.02 | 0.04 | -0.05 | -0.01 | -0.00 | -0.01 | -0.02 | -0.01 | 0.02 | 0.01 | -0.01 | -0.01 | -0.01 | -0.01 |
| Age | -0.01 | 1.00 | -0.00 | 0.01 | 0.02 | -0.00 | 0.50 | 0.30 | 0.00 | 0.00 | 0.68 | -0.02 | -0.02 | 0.31 | 0.21 | 0.22 | 0.20 |
| DistanceFromHome | 0.03 | -0.00 | 1.00 | -0.02 | 0.03 | -0.00 | -0.02 | -0.03 | 0.04 | 0.03 | 0.00 | -0.04 | -0.03 | 0.01 | 0.02 | 0.01 | 0.01 |
| EnvironmentSatisfaction | 0.02 | 0.01 | -0.02 | 1.00 | -0.05 | -0.01 | -0.01 | 0.01 | -0.03 | -0.03 | -0.00 | -0.02 | 0.03 | 0.00 | 0.02 | 0.02 | -0.00 |
| HourlyRate | 0.04 | 0.02 | 0.03 | -0.05 | 1.00 | -0.07 | -0.02 | 0.02 | -0.01 | -0.00 | -0.00 | -0.01 | -0.00 | -0.02 | -0.02 | -0.03 | -0.02 |
| JobSatisfaction | -0.05 | -0.00 | -0.00 | -0.01 | -0.07 | 1.00 | -0.01 | -0.06 | 0.02 | 0.00 | -0.02 | -0.01 | -0.02 | -0.00 | -0.00 | -0.02 | -0.03 |
| MonthlyIncome | -0.01 | 0.50 | -0.02 | -0.01 | -0.02 | -0.01 | 1.00 | 0.15 | -0.03 | -0.02 | 0.77 | -0.02 | 0.03 | 0.51 | 0.36 | 0.34 | 0.34 |
| NumCompaniesWorked | -0.00 | 0.30 | -0.03 | 0.01 | 0.02 | -0.06 | 0.15 | 1.00 | -0.01 | -0.01 | 0.24 | -0.07 | -0.01 | -0.12 | -0.09 | -0.04 | -0.11 |
| PercentSalaryHike | -0.01 | 0.00 | 0.04 | -0.03 | -0.01 | 0.02 | -0.03 | -0.01 | 1.00 | 0.77 | -0.02 | -0.01 | -0.00 | -0.04 | -0.00 | -0.02 | -0.01 |
| PerformanceRating | -0.02 | 0.00 | 0.03 | -0.03 | -0.00 | 0.00 | -0.02 | -0.01 | 0.77 | 1.00 | 0.01 | -0.02 | 0.00 | 0.00 | 0.03 | 0.02 | 0.02 |
| TotalWorkingYears | -0.01 | 0.68 | 0.00 | -0.00 | -0.00 | -0.02 | 0.77 | 0.24 | -0.02 | 0.01 | 1.00 | -0.04 | 0.00 | 0.63 | 0.46 | 0.40 | 0.46 |
| TrainingTimesLastYear | 0.02 | -0.02 | -0.04 | -0.02 | -0.01 | -0.01 | -0.02 | -0.07 | -0.01 | -0.02 | -0.04 | 1.00 | 0.03 | 0.00 | -0.01 | -0.00 | -0.00 |
| WorkLifeBalance | 0.01 | -0.02 | -0.03 | 0.03 | -0.00 | -0.02 | 0.03 | -0.01 | -0.00 | 0.00 | 0.00 | 0.03 | 1.00 | 0.01 | 0.05 | 0.01 | 0.00 |
| YearsAtCompany | -0.01 | 0.31 | 0.01 | 0.00 | -0.02 | -0.00 | 0.51 | -0.12 | -0.04 | 0.00 | 0.63 | 0.00 | 0.01 | 1.00 | 0.76 | 0.62 | 0.77 |
| YearsInCurrentRole | -0.01 | 0.21 | 0.02 | 0.02 | -0.02 | -0.00 | 0.36 | -0.09 | -0.00 | 0.03 | 0.46 | -0.01 | 0.05 | 0.76 | 1.00 | 0.55 | 0.71 |
| YearsSinceLastPromotion | -0.01 | 0.22 | 0.01 | 0.02 | -0.03 | -0.02 | 0.34 | -0.04 | -0.02 | 0.02 | 0.40 | -0.00 | 0.01 | 0.62 | 0.55 | 1.00 | 0.51 |
| YearsWithCurrManager | -0.01 | 0.20 | 0.01 | -0.00 | -0.02 | -0.03 | 0.34 | -0.11 | -0.01 | 0.02 | 0.46 | -0.00 | 0.00 | 0.77 | 0.71 | 0.51 | 1.00 |
# EmployeeNumber is dropped before modelling. errors='ignore' keeps this step
# re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns=['EmployeeNumber'], errors='ignore')
f,ax = plt.subplots(figsize=(20, 20))
# Correlate the numeric columns only; DataFrame.corr() raises on the remaining
# object columns in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
<AxesSubplot:>
Job level is strongly correlated with total working years
Monthly income is strongly correlated with Job level
Monthly income is strongly correlated with total working years (0.77)
Age is moderately correlated with monthly income (0.50)
# Check the skewness of the numeric columns.
# numeric_only=True is explicit because the frame still contains object
# columns, which DataFrame.skew() rejects in pandas >= 2.0.
df.skew(numeric_only=True)
Age 0.413286 DistanceFromHome 0.958118 EnvironmentSatisfaction -0.321654 HourlyRate -0.031718 JobSatisfaction -0.329672 MonthlyIncome 1.369817 NumCompaniesWorked 1.026471 PercentSalaryHike 0.821128 PerformanceRating 1.921883 TotalWorkingYears 1.117172 TrainingTimesLastYear 0.553124 WorkLifeBalance -0.552480 YearsAtCompany 1.764529 YearsInCurrentRole 0.917363 YearsSinceLastPromotion 1.984290 YearsWithCurrManager 0.833451 dtype: float64
# Labels of the numeric columns (public select_dtypes API instead of the
# private _get_numeric_data helper).
numerical = df.select_dtypes(include='number').columns
# Apply log(1 + x) to every numeric column whose skewness exceeds 0.55 to
# compress its right tail; other columns are left unchanged.
for col in numerical:
    if df[col].skew() > 0.55:
        df[col] = np.log1p(df[col])
# Skewness after the transform; numeric_only=True because object columns are
# still present and DataFrame.skew() rejects them in pandas >= 2.0.
df.skew(numeric_only=True)
Age 0.413286 DistanceFromHome -0.029121 EnvironmentSatisfaction -0.321654 HourlyRate -0.031718 JobSatisfaction -0.329672 MonthlyIncome 0.286448 NumCompaniesWorked 0.092896 PercentSalaryHike 0.513543 PerformanceRating 1.921883 TotalWorkingYears -0.622175 TrainingTimesLastYear -1.075852 WorkLifeBalance -0.552480 YearsAtCompany -0.207708 YearsInCurrentRole -0.383498 YearsSinceLastPromotion 0.718805 YearsWithCurrManager -0.357686 dtype: float64
# Box plots of the numeric columns after the skewness transform, on a 4x4 grid
num_of_rows = 4
num_of_cols = 4
fig, ax = plt.subplots(num_of_rows, num_of_cols, figsize=(15,15))
print(numerical)
# Pair panels with columns row by row. zip stops at the shorter sequence, so a
# column count different from 16 cannot raise IndexError.
for panel, column in zip(ax.flat, numerical):
    # Pass the values as x= explicitly; the meaning of a bare positional
    # argument differs between seaborn versions.
    sns.boxplot(x=df[column], ax=panel)
plt.show()
Index(['Age', 'DistanceFromHome', 'EnvironmentSatisfaction', 'HourlyRate',
'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
'PercentSalaryHike', 'PerformanceRating', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
# Replace each categorical column by integer codes using LabelEncoder
# (this is label encoding, not dummy/one-hot variables).
# NOTE(review): the nominal features (e.g. Department, JobRole) receive
# arbitrary ordinal codes, which a linear model reads as an ordering —
# consider one-hot encoding for them.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
# The encoder is re-fitted for every column, in this order.
for column in (
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
):
    df[column] = LE.fit_transform(df[column])
def _drop_iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of *frame* whose *column* value passes the IQR rule.

    A row is kept when its value lies within
    [Q1 - whisker * IQR, Q3 + whisker * IQR] (bounds inclusive), where Q1 and
    Q3 are the 25% and 75% quantiles of *column* in *frame*.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    spread = q3 - q1
    lower = q1 - whisker * spread
    upper = q3 + whisker * spread
    return frame[(frame[column] >= lower) & (frame[column] <= upper)]


# Removing the outliers column by column. The order matters: each filter
# computes its quartiles on the rows that survived the previous one.
for _outlier_column in ('TrainingTimesLastYear', 'TotalWorkingYears', 'YearsAtCompany'):
    df = _drop_iqr_outliers(df, _outlier_column)
df.duplicated().sum()
0
from sklearn.metrics import confusion_matrix
# Target: the Attrition column; features: every other column.
y = df['Attrition']
X = df.drop(columns=['Attrition'])
# Check the class balance of the target: number of records with Attrition = 0 (stayed) and 1 (left)
df["Attrition"].value_counts()
0 1094 1 176 Name: Attrition, dtype: int64
from sklearn.model_selection import train_test_split
# Split the dataset into 70% Training set and 30% Testing set.
# The target is imbalanced (far fewer leavers than stayers), so the split is
# stratified on y: both parts keep the same attrition ratio. random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
print(X_train.shape)
print(X_test.shape)
(889, 23) (381, 23)
from sklearn.preprocessing import StandardScaler
# StandardScaler standardizes each feature (removes the mean and scales to
# unit variance); it does not map values into the range 0..1.
sc = StandardScaler()
# Learn the per-feature statistics from the training set only ...
sc.fit(X_train)
# ... then apply that same transformation to both the training and test sets.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# Build a logistic-regression classifier with default settings and train it
# on the scaled training data (fit returns the fitted estimator itself).
log_reg = LogisticRegression().fit(X_train, y_train)
LogisticRegression()
# Predicted class for every row of the test set
y_pred = log_reg.predict(X_test)
# Fraction of test rows whose prediction matches the true label
accuracy_score(y_true=y_test, y_pred=y_pred)
0.884514435695538
## Confusion matrix
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(log_reg, X_test, y_test)
plt.show()
## Per-class precision, recall and F1-score on the test set
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)
precision recall f1-score support
0 0.88 0.99 0.94 325
1 0.88 0.25 0.39 56
accuracy 0.88 381
macro avg 0.88 0.62 0.66 381
weighted avg 0.88 0.88 0.86 381
The overall attrition rate in the company is around 16.1% (237 of 1,470 employees).
Younger employees, particularly those in their early 30s, tend to have a higher attrition rate compared to older employees.
We can infer that younger employees with lower income, in certain job roles, with lower work-life balance and environment satisfaction scores, and who have spent fewer years at the company are more likely to leave the organization.
People who work overtime are more likely to leave the company. Hence, efforts should be made to add manpower so as to reduce the need for overtime.
The company should review and adjust the salary structure.
Need to understand the concerns and address any issues contributing to attrition.
By adopting these measures, the company can take proactive steps to reduce attrition, improve employee satisfaction, and build a stronger and more committed workforce. Ultimately, this will lead to increased productivity, higher employee morale, and better overall performance for the organization.